library(DBI)
library(tidyverse)
library(ggrepel)
require(RSQLite)

seasonYear <- 2018  # start year of the season: 2018 means the 2018-2019 season

# path to the SQLite database; this location is specific to the author's
# local drive
mainFile <- "./data/archive/basketball.sqlite"

# fail early with a clear message if the database file is missing
if (!file.exists(mainFile)) {
  stop("SQLite database not found: ", mainFile, call. = FALSE)
}

# read the whole Game table; tryCatch(finally = ) closes the connection even
# when the query fails
mydb <- dbConnect(RSQLite::SQLite(), mainFile)
df <- tryCatch(
  dbGetQuery(mydb, "SELECT * FROM Game"),
  finally = dbDisconnect(mydb)
)

# keep the games of the selected season (only the columns needed below),
# ordered by date
season_df <- df %>%
  mutate(
    GAME_DATE = as.Date(GAME_DATE),
    SEASON = as.numeric(SEASON)
  ) %>%
  filter(SEASON == seasonYear) %>%
  select(GAME_DATE, TEAM_NAME_HOME, TEAM_NAME_AWAY, WL_HOME, WL_AWAY) %>%
  arrange(GAME_DATE)

head(season_df)

#    GAME_DATE        TEAM_NAME_HOME         TEAM_NAME_AWAY WL_HOME WL_AWAY
# 1 2018-10-16 Golden State Warriors  Oklahoma City Thunder       W       L
# 2 2018-10-16        Boston Celtics     Philadelphia 76ers       W       L
# 3 2018-10-17     San Antonio Spurs Minnesota Timberwolves       W       L
# 4 2018-10-17       New York Knicks          Atlanta Hawks       W       L
# 5 2018-10-17          Phoenix Suns       Dallas Mavericks       W       L
# 6 2018-10-17           LA Clippers         Denver Nuggets       L       W

# get team abbreviations and names for the selected season only.
# `df` holds every season in the Game table; if a team name appears there with
# more than one abbreviation, a lookup built from all seasons would have
# several rows for that team and duplicate it when joined onto per-team
# results.
team_abbrev_df <- df %>%
  filter(as.numeric(SEASON) == seasonYear) %>%
  select(team = TEAM_NAME_HOME,
         team_abbr = TEAM_ABBREVIATION_HOME) %>%
  distinct()

# alphabetical list of the teams that played a home game this season; this
# fixes the column order of the model matrix built below
teams <- sort(unique(season_df$TEAM_NAME_HOME))

# get dataframe for Bradley-Terry model
#' Encode one game as a row of the Bradley-Terry design matrix
#'
#' @param home_team Name of the home team.
#' @param away_team Name of the away team.
#' @param teams Character vector of team names; one output entry per element.
#' @return Numeric vector of `length(teams)`: 1 where `teams` equals
#'   `home_team`, -1 where it equals `away_team`, 0 elsewhere. The away
#'   marker is written last, so it wins if both names are the same.
get_data_vec <- function(home_team, away_team, teams) {
  design_row <- numeric(length(teams))
  design_row <- replace(design_row, teams == home_team, 1)
  replace(design_row, teams == away_team, -1)
}
# first game of the season, for reference
season_df[1, ]

# Build the design matrix: one row per game, one column per team, filled by
# get_data_vec() (+1 home team, -1 away team, 0 otherwise).
# vapply() walks the two team-name columns directly rather than apply()-ing
# over the whole data frame (which first coerces every column, including the
# Date, to character), and it checks that each game yields exactly
# length(teams) numeric values.
X <- vapply(
  seq_len(nrow(season_df)),
  function(i) {
    get_data_vec(season_df$TEAM_NAME_HOME[i],
                 season_df$TEAM_NAME_AWAY[i],
                 teams)
  },
  numeric(length(teams))
)

# vapply() returns teams x games; transpose to games x teams
X <- t(X)
colnames(X) <- teams

# design row of the first game, labelled by team
X[1, ]

dim(X)
# [1] 1230   30

# response: 1 if the home team won the game, 0 otherwise
y <- as.numeric(season_df$WL_HOME == "W")
bt_df <- as.data.frame(cbind(X, y))

# long-format peek at the model data: one row per (game, team) pair, where
# `home` is that team's design-matrix entry for the game
bt_df %>%
  as_tibble() %>%
  pivot_longer(-y, names_to = "team", values_to = "home")

# Bradley-Terry model with home advantage: logistic regression of the home
# result on the team indicator columns; every row codes the home team, so the
# intercept plays the role of the home-advantage term
bt_mod <- glm(y ~ ., data = bt_df, family = binomial())

summary(bt_mod)

# Compare BT coefficients with overall win percentage.
# coef() returns one entry per model term in column order (intercept first,
# then the teams in the order of `teams`) and reports NA for a coefficient
# glm() could not estimate. Because the team columns are linearly dependent
# (each game row holds one +1 and one -1), one team is dropped and serves as
# the reference level with beta = 0. Filling the NA in place keeps every beta
# aligned with its team regardless of which column was dropped.
coef_df <- data.frame(
  team = teams,
  beta = unname(coef(bt_mod)[-1])  # [-1] drops the intercept
)
coef_df$beta[is.na(coef_df$beta)] <- 0

# get team win percentages

# wins and losses per team in home games
home_df <- season_df %>%
  group_by(TEAM_NAME_HOME) %>%
  summarize(home_win  = sum(WL_HOME == "W"),
            home_loss = sum(WL_HOME == "L"))

# wins and losses per team in away games
away_df <- season_df %>%
  group_by(TEAM_NAME_AWAY) %>%
  summarize(away_win  = sum(WL_AWAY == "W"),
            away_loss = sum(WL_AWAY == "L"))

# combine home and away records, compute the win percentage, sort best-first
# and attach the team abbreviation; the join key is spelled out so the join
# cannot silently pick up any other column the two tables happen to share
win_pct_df <- inner_join(home_df, away_df,
                         by = c("TEAM_NAME_HOME" = "TEAM_NAME_AWAY")) %>%
  transmute(team = TEAM_NAME_HOME,
            win = home_win + away_win,
            loss = home_loss + away_loss) %>%
  mutate(win_pct = win / (win + loss)) %>%
  arrange(desc(win_pct)) %>%
  left_join(team_abbrev_df, by = "team")

# scatter plot of season win percentage against the fitted Bradley-Terry
# coefficient, with points labelled by team abbreviation; the join key is
# given explicitly instead of relying on a natural join
win_pct_df %>%
  inner_join(coef_df, by = "team") %>%
  ggplot(aes(x = win_pct, y = beta)) +
  geom_point() +
  geom_text_repel(aes(label = team_abbr)) +
  labs(x = "Win percentage", y = "Bradley-Terry beta",
       title = "Bradley-Terry beta vs. Win %")
